// GSE_FS.do
// Inputs: fs_full.csv, gse_full.csv
// Outputs: fs_zip_avg, gse_fs, gse_fs_time_series
// Date last updated: 1/27/2025

********************************************************************************	
*  Aggregate FS data to 3-digit ZIP code level
********************************************************************************

// Import data
	import delimited "$pathr\fs_full.csv", bindquote(strict) clear

// Extract 3-digit zip code from 5-digit
	rename zcta prop_zip
	replace prop_zip= "0" + prop_zip if strlen(prop_zip)==4
	replace prop_zip= "00" + prop_zip if strlen(prop_zip)==3
	replace prop_zip = substr(prop_zip, 1,3) if strlen(prop_zip)==5

// By zip code and tract fips - mean risks
	bys prop_zip: egen flood_zip_avg = mean(fl_floodfactor)
	bys prop_zip: egen heat_zip_avg = mean(ht_heatfactor)
	bys prop_zip: egen wind_zip_avg = mean(hw_windfactor)
	
	bys county_fips: egen flood_county_avg = mean(fl_floodfactor)
	bys county_fips: egen heat_county_avg = mean(ht_heatfactor)
	bys county_fips: egen wind_county_avg = mean(hw_windfactor)
	
// By zip code and tract fips - median risks
	bys prop_zip: egen flood_zip_med = median(fl_floodfactor), autotype
	bys prop_zip: egen heat_zip_med = median(ht_heatfactor), autotype
	bys prop_zip: egen wind_zip_med = median(hw_windfactor), autotype
	
	bys county_fips: egen flood_county_med = median(fl_floodfactor)
	bys county_fips: egen heat_county_med = median(ht_heatfactor)
	bys county_fips: egen wind_county_med = median(hw_windfactor)
	compress

********************************************************************************	
*  Merge with GSE data by zip code
********************************************************************************

// First Street: collapse to one observation per zip code
	collapse (mean) flood_zip* wind_zip* heat_zip* state_fips county_fips, by(prop_zip)
	save "$pathi/fs_zip_avg.dta", replace

// GSE data
	import delimited "$pathr/gse_full.csv", clear

// Clean GSE data
	tostring prop_zip, replace // allows for merging with FS
	drop if prop_zip == "0" | prop_zip == "." // drop areas with no proper zip code or addresses (placeholder)

// Make zip codes with leading zeros
	replace prop_zip = "00" + prop_zip if strlen(prop_zip) == 1
	replace prop_zip = "0" + prop_zip if strlen(prop_zip) == 2

// Merge and save
	merge m:1 prop_zip using "$pathi/fs_zip_avg", keepusing(flood_zip_avg heat_zip_avg wind_zip_avg flood_zip_med heat_zip_med wind_zip_med)
	keep if _merge== 3 // drop unmatched observations
	save "$pathf\gse_fs.dta", replace // save merged data
	
********************************************************************************	
*  Merge in GSE zero balance variables
********************************************************************************
// Import and merge w finished dataset
	import delimited "$pathr/gseZeroBalance.csv", clear
	merge 1:1 loan_id using "$pathf/gse_fs.dta"
	keep if _merge==3
	drop _merge
	compress
	
********************************************************************************	
*  Cleaning: Keep coastal ZIP3s, reformat dates, and define zero balance
********************************************************************************
// Clean and drop unnecessary vars to save space
	drop wind_zip_avg wind_zip_med heat_zip_avg heat_zip_med fire_zip_avg fire_zip_med filename*
	drop if loan_term_orig<0 // only 5 obs
	keep if prop_type=="SF" // single family homes only
		
// Keep loans from coastal states only (saves space & lets us later restrict to whatever state we want)		
	keep if prop_state == "AL" | prop_state == "CA" | prop_state == "CT" | prop_state == "DC" | prop_state == "FL" | prop_state == "DE" | prop_state == "GA" | prop_state == "LA" | prop_state == "MA" | prop_state == "MD" | prop_state == "ME" | prop_state == "MS" | prop_state == "NC" | prop_state == "NH" | prop_state == "NJ" | prop_state == "NY" | prop_state == "OR" | prop_state == "PA" | prop_state == "RI" | prop_state == "SC" | prop_state == "TX" | prop_state == "VA" | prop_state == "WA" | prop_state == "PR"
		
	destring prop_zip, replace
	
// Keep only coastal zip3
	* source: https://qmiusa.com/wp-content/uploads/2021/01/coastal_map_and_zip_codes.pdf
	keep if inlist(prop_zip, 019, 020, 021, 022, 023, 025, 026, 027, 028, 038, 039, 040, 041, 044, 045, 046, 048, 049, 063, 064, 065, 066, 068, 069, 070, 073, 077, 080, 082, 083, 084, 087, 088, 100, 102, 103, 104, 105, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 197, 199, 206, 207, 210, 211, 212, 214, 216, 218, 219, 220, 221, 224, 225, 230, 231, 233, 234, 235, 236, 237, 238, 278, 279, 284, 285, 294, 295, 299, 313, 314, 315, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 333, 334, 335, 336, 337, 339, 341, 342, 344, 346, 349, 365, 366, 395, 700, 701, 703, 704, 705, 706, 774, 775, 776, 779, 783, 784, 785, 900, 902, 904, 907, 908, 913, 919, 920, 921, 926, 930, 931, 934, 939, 940, 941, 943, 944, 945, 946, 947, 948, 949, 950, 954, 955, 971, 973, 974, 982, 983, 985, 986) // NOTE: this excludes all PR zip codes

// Put date variables into quarters
	* Close (origination) date
	gen close_yr = round(floor(close_dt / 100))
	gen close_mo = round((close_dt/100 - close_yr) *100)
	gen start_date = ym(close_yr, close_mo)
	gen start_q = qofd(dofm(start_date))
	
	* Zero balance date
	gen end_yr = round(floor(zero_bal_dt / 100))
	gen end_mo = round((zero_bal_dt/100 - end_yr) *100)
	gen end_date = ym(end_yr, end_mo)
	gen end_q = qofd(dofm(end_date))
	
// Define default variable
	gen default = 0
	replace default = 1 if zero_bal_code!=1 & zero_bal_code!=2 & zero_bal_code!=.
	label var default "1 if has reached zero balance and is not because of prepayment/maturity or a third party sale"
	// 	replace default = 1 if zero_bal_code==9 | zero_bal_code==98
	// 	label var default "1 if loan has reached zero balance from REO disposition (foreclosure) or deed-in-lieu of foreclosure"

// Quarterly default variables
	gen default_yr = end_yr 	 if default==1
	gen default_mo = end_mo 	 if default==1
	gen default_date = end_date  if default==1
	gen default_q = end_q		 if default==1
	
	format start_date end_date default_date %tm
	format start_q end_q default_q %tq
	
// Keep only outstanding loans as of 2015q3 (originated but not yet zero balance)
	keep if start_q <= yq(2015, 3) & end_q > yq(2015, 3) // 2 yr window //Sample: 72m->15m
	*keep if start_q <= yq(2016, 3) & end_q > yq(2016, 3) // 1 yr window 
	
// Restrict to states/territories with a major hurricane in Q3 2017
	* Comment out if you want to keep all coastal counties
	keep if prop_state == "TX" | prop_state == "FL" | prop_state == "PR" //15m -> 1.4m
	
// Define a zip as at risk if average FF>=2
	gen byte at_risk = 0
	replace at_risk = 1 if flood_zip_avg >= 2
	
// Define default variables for every quarter q
	forval yr = 2016/2018{
		forval q = 1/4{
			* default (absorbing state)
			gen byte def_`yr'q`q' = 0
			replace def_`yr'q`q' = 1 if end_q <= yq(`yr', `q') & default == 1
		}
	}
	
// Reshape wide to long
	* Create different loan ID since some contain nonnumeric characters
	egen loanID = group(loan_id)
	
	* Keep only needed variables
	keep loanID prop_state prop_zip default_q at_risk def_* //def2_* 
	reshape long def_, i(loanID) j(quarter, string)
	*reshape long def_ def2_, i(loanID) j(quarter, string)
	rename def_ def
	*rename def2_ def2
	label var def "1 if has reached zero balance and is not because of prepayment/maturity or a third party sale (absorbing state)"
	*label var def2 "Same as def, except that def2=. after loan became defaulted"
	
// Set as time series
	* Get quarter variable in proper format
	gen t = quarterly(quarter, "YQ")
	format t %tq
	
	* Set panel
	xtset loanID t
	
	* Define post 2017q3 hurricane season
	gen post = t>= yq(2017,3)

// Save final dataset: 
	save "$pathf\gse_fs_time_series.dta", replace 